'''
Created on 20-Apr-2013

@author: cdac
'''
import string
from nltk.corpus import stopwords

class PreprocessSMS:
    '''
    Text clean-up helpers for lists of SMS messages.

    Every public method takes a list of message strings and returns a new
    list of the same length; the input list itself is never modified.
    '''

    # Characters deleted from anywhere inside a message. This is not the same
    # set as string.punctuation: the colon and the backslash are absent, so
    # they survive in the middle of a message (they are still trimmed from
    # its ends by remove_puncts_and_special_chars).
    SPECIAL_CHARS = frozenset("!@#$%^&*()_+=-~`<>?,./;'{}|[]\"")

    def __init__(self):
        '''
        Constructor. The class keeps no per-instance state.
        '''
        pass

    def all_preprocess_functions(self, msgs_list, stop_words=None):
        '''
        Run the whole pipeline on a list of messages.

        Steps, in order: remove punctuation / special characters, convert to
        lowercase, remove stopwords. Lowercasing runs before stopword removal
        so that capitalised words can match a lowercase stopword list.

        msgs_list  -- list of message strings.
        stop_words -- optional iterable of words to drop; passed through to
                      remove_stopwords (None selects its default list).

        Returns a new list of cleaned message strings.
        '''
        msgs_list = self.remove_puncts_and_special_chars(msgs_list)
        msgs_list = self.convert_to_lowercase(msgs_list)
        return self.remove_stopwords(msgs_list, stop_words)

    # Original (misspelled) method name, kept so existing callers keep working.
    all_preprocess_fucntions = all_preprocess_functions

    def remove_stopwords(self, msgs_list, stop_words=None):
        '''
        Drop stopwords from every message.

        Messages are split on single spaces and each word is compared
        case-sensitively with the stopword set, so callers that want
        case-insensitive matching should lowercase the messages first.
        The kept words are re-joined with single spaces, without a trailing
        space; an empty message stays empty.

        msgs_list  -- list of message strings.
        stop_words -- optional iterable of words to remove. When None, the
                      English stopword list from nltk.corpus.stopwords is
                      used.

        Returns a new list of message strings.
        '''
        if stop_words is None:
            stop_words = stopwords.words('english')
        # Build the lookup set once, outside the per-message loop.
        stop_words = frozenset(stop_words)

        clean_msgs = []
        for msg in msgs_list:
            kept_words = [word for word in msg.split(" ")
                          if word not in stop_words]
            clean_msgs.append(" ".join(kept_words))
        return clean_msgs

    def remove_puncts_and_special_chars(self, msgs_list):
        '''
        Remove punctuation and special characters from every message.

        Two passes per message:
          1. any string.punctuation characters are trimmed from both ends;
          2. every character listed in SPECIAL_CHARS is deleted from what
             remains. Characters are deleted, not replaced by a space, so
             "a@b" becomes "ab".

        msgs_list -- list of message strings.

        Returns a new list of message strings.
        '''
        special_chars = self.SPECIAL_CHARS

        clean_msgs = []
        for msg in msgs_list:
            # str.strip works on both Python 2 and 3; the string.strip()
            # module function it replaces exists only on Python 2.
            trimmed = msg.strip(string.punctuation)
            clean_msgs.append(
                "".join(char for char in trimmed if char not in special_chars))
        return clean_msgs

    def convert_to_lowercase(self, msgs_list):
        '''
        Lowercase every message.

        msgs_list -- list of message strings.

        Returns a new list with each message converted via str.lower().
        '''
        # str.lower replaces the Python 2-only string.lower() function.
        return [msg.lower() for msg in msgs_list]
    
    
if __name__ == '__main__':
    # Manual demo of the individual steps and of the full pipeline.
    # remove_stopwords loads the English list from nltk.corpus.stopwords.
    msgs = ['How are you', 'I am doing good', 'My email ID is maheshp@cdac.in']

    process = PreprocessSMS()

    # Single-argument print(...) calls give the same output on Python 2 and
    # Python 3, whereas the print statement is a syntax error on Python 3.
    print(msgs)
    msgs = process.remove_puncts_and_special_chars(msgs)
    print(process.convert_to_lowercase(msgs))

    # NOTE: the lowercased list above is only printed, not kept, so this
    # step runs on the mixed-case messages.
    print(process.remove_stopwords(msgs))

    msgs = ['How are you', 'I am doing good', 'My email ID is maheshp@cdac.in']
    print(process.all_preprocess_fucntions(msgs))